Data Inspection¶
In [1]:
import plotly
plotly.offline.init_notebook_mode()
In [2]:
import pandas as pd
from tqdm import tqdm
df_list = list()
chunk_iter = pd.read_csv(
"../data/Total_Data_10Y_Top24.csv",
chunksize=100000,
dtype = {
"CANCELLATION_CODE": str,
}
)
for chunk in tqdm(chunk_iter):
df_list.append(chunk)
df = pd.concat(df_list)
df.head()
130it [00:30, 4.23it/s]
Out[2]:
| FL_DATE | OP_UNIQUE_CARRIER | TAIL_NUM | ORIGIN_AIRPORT_SEQ_ID | ORIGIN_CITY_MARKET_ID | ORIGIN | ORIGIN_CITY_NAME | DEST_AIRPORT_SEQ_ID | DEST_CITY_MARKET_ID | DEST | ... | TAXI_IN | ARR_TIME | ARR_DELAY | CANCELLED | CANCELLATION_CODE | CARRIER_DELAY | WEATHER_DELAY | NAS_DELAY | SECURITY_DELAY | LATE_AIRCRAFT_DELAY | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014-07-01 | AA | N002AA | 1105703 | 31057 | CLT | Charlotte, NC | 1129803 | 30194 | DFW | ... | 28.0 | 1214.0 | 9.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 2014-07-01 | AA | N002AA | 1129803 | 30194 | DFW | Dallas/Fort Worth, TX | 1105703 | 31057 | CLT | ... | 13.0 | 945.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 2014-07-01 | AA | N004AA | 1039705 | 30397 | ATL | Atlanta, GA | 1129803 | 30194 | DFW | ... | 6.0 | 1341.0 | -9.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 2014-07-01 | AA | N004AA | 1129803 | 30194 | DFW | Dallas/Fort Worth, TX | 1039705 | 30397 | ATL | ... | 7.0 | 1159.0 | 4.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 2014-07-01 | AA | N004AA | 1129803 | 30194 | DFW | Dallas/Fort Worth, TX | 1039705 | 30397 | ATL | ... | 6.0 | 2317.0 | 2.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 24 columns
Data Cleaning¶
In [3]:
df.columns
Out[3]:
Index(['FL_DATE', 'OP_UNIQUE_CARRIER', 'TAIL_NUM', 'ORIGIN_AIRPORT_SEQ_ID',
'ORIGIN_CITY_MARKET_ID', 'ORIGIN', 'ORIGIN_CITY_NAME',
'DEST_AIRPORT_SEQ_ID', 'DEST_CITY_MARKET_ID', 'DEST', 'DEST_CITY_NAME',
'DEP_TIME', 'DEP_DELAY', 'TAXI_OUT', 'TAXI_IN', 'ARR_TIME', 'ARR_DELAY',
'CANCELLED', 'CANCELLATION_CODE', 'CARRIER_DELAY', 'WEATHER_DELAY',
'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY'],
dtype='object')
In [4]:
import matplotlib.pyplot as plt
from datetime import time
target_df = df.drop(columns = [
"TAIL_NUM",
"ORIGIN_AIRPORT_SEQ_ID",
"ORIGIN_CITY_MARKET_ID",
"ORIGIN",
"ORIGIN_CITY_NAME",
"DEST_AIRPORT_SEQ_ID",
"DEST_CITY_MARKET_ID",
"DEST",
"DEST_CITY_NAME",
"DEP_TIME",
"TAXI_OUT",
"TAXI_IN",
"ARR_TIME",
])
del df
In [5]:
target_df['FL_DATE'] = pd.to_datetime(target_df['FL_DATE'])
In [6]:
airline_names = pd.read_csv("../supplementary/top_18_airline_review_link.csv")
airline_names
Out[6]:
| index | OP_UNIQUE_CARRIER | count | CARRIER | CARRIERNAME | matched_name | urls | |
|---|---|---|---|---|---|---|---|
| 0 | 15 | WN | 63426 | WN | Southwest Airlines Co. | Southwest Airlines | https://www.airlinequality.com/airline-reviews... |
| 1 | 4 | DL | 43959 | DL | Delta Air Lines Inc. | Delta Air Lines | https://www.airlinequality.com/airline-reviews... |
| 2 | 1 | AA | 41637 | AA | American Airlines Inc. | American Airlines | https://www.airlinequality.com/airline-reviews... |
| 3 | 12 | OO | 34846 | OO | SkyWest Airlines Inc. | SkyWest Airlines | https://www.airlinequality.com/airline-reviews... |
| 4 | 14 | UA | 28030 | UA | United Air Lines Inc. | United Airlines | https://www.airlinequality.com/airline-reviews... |
| 5 | 3 | B6 | 12805 | B6 | JetBlue Airways | Jetblue Airways | https://www.airlinequality.com/airline-reviews... |
| 6 | 2 | AS | 9965 | AS | Alaska Airlines Inc. | Alaska Airlines | https://www.airlinequality.com/airline-reviews... |
| 7 | 10 | NK | 8757 | NK | Spirit Air Lines | Spirit Airlines | https://www.airlinequality.com/airline-reviews... |
| 8 | 6 | F9 | 6270 | F9 | Frontier Airlines Inc. | Frontier Airlines | https://www.airlinequality.com/airline-reviews... |
| 9 | 8 | HA | 3679 | HA | Hawaiian Airlines Inc. | Hawaiian Airlines | https://www.airlinequality.com/airline-reviews... |
| 10 | 7 | G4 | 3477 | G4 | Allegiant Air | Allegiant Air | https://www.airlinequality.com/airline-reviews... |
| 11 | 13 | QX | 1000 | QX | Horizon Air | Horizon Air | https://www.airlinequality.com/airline-reviews... |
In [7]:
airport_set_df = target_df.OP_UNIQUE_CARRIER.drop_duplicates()
airport_review = pd.read_csv("../supplementary/Airline-Reviews-Table.csv")
airport_review = airport_review[airport_review.AIRLINE_CODE.isin(airport_set_df)]
airport_default_score = airport_review.groupby("AIRLINE_CODE")[[
'ratingValue',
'seatComfort',
'foodBeverage',
'cabinService',
'inflightEntertainment',
'wifiConnectivity',
'groundService',
'valueForMoney',
'sentiment'
]].mean().reset_index()
airport_review_count = airport_review.groupby("AIRLINE_CODE").size().reset_index().rename(columns = {0:"reviewCount"})
airport_review_merged = pd.merge(airport_review_count, airport_default_score, how = "inner", on = "AIRLINE_CODE")
airport_review_merged = pd.merge(airport_review_merged, airline_names, how = "inner", left_on = "AIRLINE_CODE", right_on = "OP_UNIQUE_CARRIER")
airport_review_merged.head()
C:\Users\wongh\AppData\Local\Temp\ipykernel_7400\2395432350.py:2: DtypeWarning: Columns (14) have mixed types. Specify dtype option on import or set low_memory=False.
Out[7]:
| AIRLINE_CODE | reviewCount | ratingValue | seatComfort | foodBeverage | cabinService | inflightEntertainment | wifiConnectivity | groundService | valueForMoney | sentiment | index | OP_UNIQUE_CARRIER | count | CARRIER | CARRIERNAME | matched_name | urls | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AA | 5000 | 0.214920 | 1.960084 | 1.860507 | 2.159644 | 1.973280 | 1.692893 | 1.638175 | 1.591600 | -0.785200 | 1 | AA | 41637 | AA | American Airlines Inc. | American Airlines | https://www.airlinequality.com/airline-reviews... |
| 1 | AS | 794 | 0.376952 | 2.656944 | 2.542751 | 2.850350 | 2.487298 | 2.416867 | 2.352863 | 2.287154 | -0.400504 | 2 | AS | 9965 | AS | Alaska Airlines Inc. | Alaska Airlines | https://www.airlinequality.com/airline-reviews... |
| 2 | B6 | 1422 | 0.279817 | 2.480945 | 2.373869 | 2.474046 | 2.544898 | 2.353963 | 1.906773 | 1.931083 | -0.637131 | 3 | B6 | 12805 | B6 | JetBlue Airways | Jetblue Airways | https://www.airlinequality.com/airline-reviews... |
| 3 | DL | 2667 | 0.357593 | 2.567887 | 2.569762 | 2.851837 | 2.887197 | 2.320190 | 2.324231 | 2.224681 | -0.432321 | 4 | DL | 43959 | DL | Delta Air Lines Inc. | Delta Air Lines | https://www.airlinequality.com/airline-reviews... |
| 4 | F9 | 3311 | 0.158774 | 1.443140 | 1.258427 | 1.772019 | 1.050761 | 1.061650 | 1.361199 | 1.380248 | -0.887043 | 6 | F9 | 6270 | F9 | Frontier Airlines Inc. | Frontier Airlines | https://www.airlinequality.com/airline-reviews... |
In [8]:
target_airport_df = pd.merge(
target_df,
airport_review_merged,
how = "inner",
left_on = "OP_UNIQUE_CARRIER",
right_on = "AIRLINE_CODE",
suffixes = ["", "_origin"]
)
target_airport_df = pd.merge(
target_airport_df,
airline_names,
how = "inner",
on = "OP_UNIQUE_CARRIER",
)
target_airport_df.rename(columns = {
'reviewCount':'Review Count',
'ratingValue':'Rating Value',
'seatComfort': "Seat Comfort",
'foodBeverage': "Food & Beverage",
'cabinService': "Cabin Service",
'inflightEntertainment': "Inflight Entertainment",
'wifiConnectivity': "Wifi Connectivity",
'groundService': "Ground Service",
'valueForMoney': "Value for Money",
'sentiment': "Sentiment"
}, inplace = True)
target_airport_df.drop(columns = ['OP_UNIQUE_CARRIER'], inplace = True)
target_airport_df['CANCELLATION_CODE'] = target_airport_df['CANCELLATION_CODE'].fillna("Not")
target_airport_df.head()
Out[8]:
| FL_DATE | DEP_DELAY | ARR_DELAY | CANCELLED | CANCELLATION_CODE | CARRIER_DELAY | WEATHER_DELAY | NAS_DELAY | SECURITY_DELAY | LATE_AIRCRAFT_DELAY | ... | CARRIER_x | CARRIERNAME_x | matched_name_x | urls_x | index_y | count_y | CARRIER_y | CARRIERNAME_y | matched_name_y | urls_y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014-07-01 | -5.0 | 9.0 | 0.0 | Not | NaN | NaN | NaN | NaN | NaN | ... | AA | American Airlines Inc. | American Airlines | https://www.airlinequality.com/airline-reviews... | 1 | 41637 | AA | American Airlines Inc. | American Airlines | https://www.airlinequality.com/airline-reviews... |
| 1 | 2014-07-01 | -4.0 | 0.0 | 0.0 | Not | NaN | NaN | NaN | NaN | NaN | ... | AA | American Airlines Inc. | American Airlines | https://www.airlinequality.com/airline-reviews... | 1 | 41637 | AA | American Airlines Inc. | American Airlines | https://www.airlinequality.com/airline-reviews... |
| 2 | 2014-07-01 | -2.0 | -9.0 | 0.0 | Not | NaN | NaN | NaN | NaN | NaN | ... | AA | American Airlines Inc. | American Airlines | https://www.airlinequality.com/airline-reviews... | 1 | 41637 | AA | American Airlines Inc. | American Airlines | https://www.airlinequality.com/airline-reviews... |
| 3 | 2014-07-01 | 14.0 | 4.0 | 0.0 | Not | NaN | NaN | NaN | NaN | NaN | ... | AA | American Airlines Inc. | American Airlines | https://www.airlinequality.com/airline-reviews... | 1 | 41637 | AA | American Airlines Inc. | American Airlines | https://www.airlinequality.com/airline-reviews... |
| 4 | 2014-07-01 | 1.0 | 2.0 | 0.0 | Not | NaN | NaN | NaN | NaN | NaN | ... | AA | American Airlines Inc. | American Airlines | https://www.airlinequality.com/airline-reviews... | 1 | 41637 | AA | American Airlines Inc. | American Airlines | https://www.airlinequality.com/airline-reviews... |
5 rows × 34 columns
Visualization¶
Airport Review¶
First of all, let's have a look at the Review Data we collected
In [9]:
airport_review_merged.head()
Out[9]:
| AIRLINE_CODE | reviewCount | ratingValue | seatComfort | foodBeverage | cabinService | inflightEntertainment | wifiConnectivity | groundService | valueForMoney | sentiment | index | OP_UNIQUE_CARRIER | count | CARRIER | CARRIERNAME | matched_name | urls | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AA | 5000 | 0.214920 | 1.960084 | 1.860507 | 2.159644 | 1.973280 | 1.692893 | 1.638175 | 1.591600 | -0.785200 | 1 | AA | 41637 | AA | American Airlines Inc. | American Airlines | https://www.airlinequality.com/airline-reviews... |
| 1 | AS | 794 | 0.376952 | 2.656944 | 2.542751 | 2.850350 | 2.487298 | 2.416867 | 2.352863 | 2.287154 | -0.400504 | 2 | AS | 9965 | AS | Alaska Airlines Inc. | Alaska Airlines | https://www.airlinequality.com/airline-reviews... |
| 2 | B6 | 1422 | 0.279817 | 2.480945 | 2.373869 | 2.474046 | 2.544898 | 2.353963 | 1.906773 | 1.931083 | -0.637131 | 3 | B6 | 12805 | B6 | JetBlue Airways | Jetblue Airways | https://www.airlinequality.com/airline-reviews... |
| 3 | DL | 2667 | 0.357593 | 2.567887 | 2.569762 | 2.851837 | 2.887197 | 2.320190 | 2.324231 | 2.224681 | -0.432321 | 4 | DL | 43959 | DL | Delta Air Lines Inc. | Delta Air Lines | https://www.airlinequality.com/airline-reviews... |
| 4 | F9 | 3311 | 0.158774 | 1.443140 | 1.258427 | 1.772019 | 1.050761 | 1.061650 | 1.361199 | 1.380248 | -0.887043 | 6 | F9 | 6270 | F9 | Frontier Airlines Inc. | Frontier Airlines | https://www.airlinequality.com/airline-reviews... |
In [10]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
corr_matrix = airport_review_merged.loc[:, 'ratingValue':'sentiment'].corr()
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', mask = mask, vmin = -1, vmax = 1)
plt.title('Airline Review Score Correlation')
plt.savefig("Airline_Review_Correlation")
Summary
The majority of the score given by users are very correlated,
quite a bit better than airport views but still.
Airport Ranking¶
In [13]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
In [14]:
fig = go.Figure()
fig = px.scatter(
data_frame = airport_review_merged,
x='sentiment',
y='ratingValue',
color='ratingValue',
size = 'reviewCount',
color_continuous_scale = 'viridis',
text='CARRIERNAME',
log_y = True
)
fig.update_traces(textposition='top center')
fig.for_each_annotation(
lambda a: a.update(text= "Medium Airport" if a.text == "Large=False" else "Large Airport")
)
fig.update_layout(
height=600,
width=1000,
title_text="Airport Sentiment Visualization",
showlegend=False
)
fig.show()
In [13]:
airport_review_merged
Out[13]:
| AIRLINE_CODE | reviewCount | ratingValue | seatComfort | foodBeverage | cabinService | inflightEntertainment | wifiConnectivity | groundService | valueForMoney | sentiment | index | OP_UNIQUE_CARRIER | count | CARRIER | CARRIERNAME | matched_name | urls | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AA | 5000 | 0.214920 | 1.960084 | 1.860507 | 2.159644 | 1.973280 | 1.692893 | 1.638175 | 1.591600 | -0.785200 | 1 | AA | 41637 | AA | American Airlines Inc. | American Airlines | https://www.airlinequality.com/airline-reviews... |
| 1 | AS | 794 | 0.376952 | 2.656944 | 2.542751 | 2.850350 | 2.487298 | 2.416867 | 2.352863 | 2.287154 | -0.400504 | 2 | AS | 9965 | AS | Alaska Airlines Inc. | Alaska Airlines | https://www.airlinequality.com/airline-reviews... |
| 2 | B6 | 1422 | 0.279817 | 2.480945 | 2.373869 | 2.474046 | 2.544898 | 2.353963 | 1.906773 | 1.931083 | -0.637131 | 3 | B6 | 12805 | B6 | JetBlue Airways | Jetblue Airways | https://www.airlinequality.com/airline-reviews... |
| 3 | DL | 2667 | 0.357593 | 2.567887 | 2.569762 | 2.851837 | 2.887197 | 2.320190 | 2.324231 | 2.224681 | -0.432321 | 4 | DL | 43959 | DL | Delta Air Lines Inc. | Delta Air Lines | https://www.airlinequality.com/airline-reviews... |
| 4 | F9 | 3311 | 0.158774 | 1.443140 | 1.258427 | 1.772019 | 1.050761 | 1.061650 | 1.361199 | 1.380248 | -0.887043 | 6 | F9 | 6270 | F9 | Frontier Airlines Inc. | Frontier Airlines | https://www.airlinequality.com/airline-reviews... |
| 5 | G4 | 1575 | 0.228635 | 1.798883 | 1.643929 | 2.144672 | 1.198675 | 1.237089 | 1.711067 | 1.782222 | -0.709206 | 7 | G4 | 3477 | G4 | Allegiant Air | Allegiant Air | https://www.airlinequality.com/airline-reviews... |
| 6 | NK | 4424 | 0.205922 | 1.551877 | 1.351816 | 1.938198 | 1.100457 | 1.119522 | 1.555269 | 1.577662 | -0.767631 | 10 | NK | 8757 | NK | Spirit Air Lines | Spirit Airlines | https://www.airlinequality.com/airline-reviews... |
| 7 | OO | 11 | 0.672727 | 3.272727 | 1.833333 | 3.545455 | 0.750000 | NaN | 4.142857 | 3.090909 | 0.272727 | 12 | OO | 34846 | OO | SkyWest Airlines Inc. | SkyWest Airlines | https://www.airlinequality.com/airline-reviews... |
| 8 | UA | 4749 | 0.278227 | 2.134851 | 2.015647 | 2.399780 | 2.118077 | 1.797923 | 1.925506 | 1.883765 | -0.631501 | 14 | UA | 28030 | UA | United Air Lines Inc. | United Airlines | https://www.airlinequality.com/airline-reviews... |
| 9 | WN | 1498 | 0.363818 | 2.465083 | 2.495513 | 2.859307 | 2.282158 | 2.291667 | 2.362932 | 2.362725 | -0.423231 | 15 | WN | 63426 | WN | Southwest Airlines Co. | Southwest Airlines | https://www.airlinequality.com/airline-reviews... |
Review Performance per Airport¶
In [15]:
df_columns = [
'reviewCount',
'seatComfort',
'foodBeverage',
'cabinService',
'inflightEntertainment',
'wifiConnectivity',
'groundService',
'valueForMoney',
]
df_better_names = [
"Review Count",
"Seat Comfort",
"Food & Beverage",
"Cabin Service",
"Inflight Entertainment",
"Wifi Connectivity",
"Ground Service",
"Value for Money",
]
fig = make_subplots(rows=4, cols=2, subplot_titles=df_better_names)
col = [1, 2]*4
row = [1, 1, 2, 2, 3, 3, 4, 4]
for c, r, column, better_column in zip(col, row, df_columns, df_better_names):
airport_review_merged.sort_values(column, ascending = False, inplace = True)
trace = go.Bar(
x=airport_review_merged['CARRIERNAME'],
y=airport_review_merged[column],
marker=dict(
color = airport_review_merged[column],
colorscale='viridis'
)
)
fig.add_trace(
trace,
row=r,
col=c
)
fig.update_layout(
height=1000,
title_text="Individual Scoring Attributes of Airline",
showlegend=False
)
fig.show()
Delay vs Ranking¶
In [16]:
target_airport_df.columns
Out[16]:
Index(['FL_DATE', 'DEP_DELAY', 'ARR_DELAY', 'CANCELLED', 'CANCELLATION_CODE',
'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY',
'LATE_AIRCRAFT_DELAY', 'AIRLINE_CODE', 'Review Count', 'Rating Value',
'Seat Comfort', 'Food & Beverage', 'Cabin Service',
'Inflight Entertainment', 'Wifi Connectivity', 'Ground Service',
'Value for Money', 'Sentiment', 'index_x', 'OP_UNIQUE_CARRIER_origin',
'count_x', 'CARRIER_x', 'CARRIERNAME_x', 'matched_name_x', 'urls_x',
'index_y', 'count_y', 'CARRIER_y', 'CARRIERNAME_y', 'matched_name_y',
'urls_y'],
dtype='object')
In [17]:
train_columns = [
'Rating Value',
'Seat Comfort',
'Food & Beverage',
'Cabin Service',
'Inflight Entertainment',
'Wifi Connectivity',
'Ground Service',
'Value for Money',
'Sentiment'
]
label_columns = [
"DEP_DELAY",
"ARR_DELAY",
"CARRIER_DELAY",
"WEATHER_DELAY",
"NAS_DELAY",
"SECURITY_DELAY",
"LATE_AIRCRAFT_DELAY",
]
Delay Data Size >0
In [19]:
cal_df = target_airport_df[['CARRIERNAME_x'] + label_columns]
delay = cal_df.groupby('CARRIERNAME_x')[label_columns].apply(lambda x: (x > 0).mean()*100).reset_index().sort_values("ARR_DELAY", ascending = True)
big_delay = cal_df.groupby('CARRIERNAME_x')[label_columns].apply(lambda x: (x > 60).mean()*100).reset_index().sort_values("ARR_DELAY", ascending = True)
In [21]:
import pandas as pd
import matplotlib.pyplot as plt
df1 = delay
df2 = big_delay
fig, ax = plt.subplots(figsize=(12, 12))
bar_width = 0.35
index = range(len(df1))
bars3 = ax.barh([i for i in index], [100 for _ in range(len(df2['ARR_DELAY']))], bar_width, label='All Flight', color='royalblue')
bars1 = ax.barh(index, df1['ARR_DELAY'], bar_width, label='Arrival Delay > 0', color='pink')
bars2 = ax.barh([i for i in index], df2['ARR_DELAY'], bar_width, label='Arrival Delay > 60', color='red')
# Adding labels, title, and legend
ax.set_xlabel('Percentage (%)')
ax.set_ylabel('Airline')
ax.set_title('(%) of Delay Flights by Airline')
ax.set_yticks([i for i in index])
ax.set_yticklabels(df1['CARRIERNAME_x'], rotation=0)
ax.legend()
plt.savefig("Delay_by_Airline")
In [22]:
target_airport_df.columns
Out[22]:
Index(['FL_DATE', 'DEP_DELAY', 'ARR_DELAY', 'CANCELLED', 'CANCELLATION_CODE',
'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY',
'LATE_AIRCRAFT_DELAY', 'AIRLINE_CODE', 'Review Count', 'Rating Value',
'Seat Comfort', 'Food & Beverage', 'Cabin Service',
'Inflight Entertainment', 'Wifi Connectivity', 'Ground Service',
'Value for Money', 'Sentiment', 'index_x', 'OP_UNIQUE_CARRIER_origin',
'count_x', 'CARRIER_x', 'CARRIERNAME_x', 'matched_name_x', 'urls_x',
'index_y', 'count_y', 'CARRIER_y', 'CARRIERNAME_y', 'matched_name_y',
'urls_y'],
dtype='object')
In [23]:
target_airport_df = target_airport_df[(target_airport_df['FL_DATE'].dt.month >= 5) & (target_airport_df['FL_DATE'].dt.month <= 8)]
target_airport_df.nunique()
Out[23]:
FL_DATE 1230 DEP_DELAY 1509 ARR_DELAY 1559 CANCELLED 2 CANCELLATION_CODE 5 CARRIER_DELAY 1314 WEATHER_DELAY 704 NAS_DELAY 653 SECURITY_DELAY 198 LATE_AIRCRAFT_DELAY 1057 AIRLINE_CODE 10 Review Count 10 Rating Value 10 Seat Comfort 10 Food & Beverage 10 Cabin Service 10 Inflight Entertainment 10 Wifi Connectivity 9 Ground Service 10 Value for Money 10 Sentiment 10 index_x 10 OP_UNIQUE_CARRIER_origin 10 count_x 10 CARRIER_x 10 CARRIERNAME_x 10 matched_name_x 10 urls_x 10 index_y 10 count_y 10 CARRIER_y 10 CARRIERNAME_y 10 matched_name_y 10 urls_y 10 dtype: int64
In [24]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
corr_matrix = target_airport_df.loc[:, train_columns + label_columns].corr()
plt.figure(figsize=(10, 5))
sns.heatmap(corr_matrix.loc[train_columns, label_columns], annot=True, cmap='coolwarm')
plt.title('Airline Review Score Correlation with Delays')
plt.savefig("Airline Review Correlation with Delay")
In [32]:
import plotly.express as px
fig = px.box(
target_airport_df.sample(100000, replace = False).sort_values(["ARR_DELAY"]),
x="CARRIERNAME_x",
y="ARR_DELAY",
log_y = True,
color_discrete_map = {True: 'red', False: 'blue'}
)
fig.update_layout(
height=400,
title_text=f"Airline vs Arrival Delay",
showlegend=True
)
fig.show()
Departure Delay Airport Route¶
In [ ]:
target_airport_df
In [ ]:
target_airport_df['Route'] = target_airport_df.apply(lambda row: row['ORIGIN_CITY_NAME'] + " -> " + row['DEST_CITY_NAME'], axis = 1)
target_airport_df.head()
In [ ]:
target_airport_df_temp = target_airport_df.groupby('Route').size().reset_index().rename(columns = {0: 'DEP_DELAY'})
target_airport_df_temp.sort_values("DEP_DELAY", ascending = False, inplace = True)
target_airport_df_temp = target_airport_df_temp.iloc[:10, :]
In [ ]:
import plotly.express as px
import numpy as np
np.random.seed(1)
top10_busy_route = pd.merge(
target_airport_df,
target_airport_df_temp,
how = "inner",
on = 'Route',
suffixes = ["", "_mean"]
).sample(100000, replace = False)
fig = px.box(
top10_busy_route.sort_values("DEP_DELAY_mean", ascending = False),
x="Route",
y="DEP_DELAY",
log_y = True,
color_discrete_map = {True: 'red', False: 'blue'}
)
fig.update_layout(
height=1000,
title_text=f"Aiport Departure Delay",
showlegend=True
)
fig.show()
Cancelled¶
In [33]:
train_columns = [
'Rating Value',
'Seat Comfort',
'Food & Beverage',
'Cabin Service',
'Inflight Entertainment',
'Wifi Connectivity',
'Ground Service',
'Value for Money',
'Sentiment'
]
label_columns = [
"Carrier",
"Weather",
"NAS",
"Security",
"Not"
]
In [34]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
temp = pd.concat([
target_airport_df,
pd.get_dummies(target_airport_df['CANCELLATION_CODE'], prefix = "Cancellation")
], axis = 1)
temp.rename(columns = {
"Cancellation_A": "Carrier",
"Cancellation_B": "Weather",
"Cancellation_C": "NAS",
"Cancellation_D": "Security",
"Cancellation_Not": "Not"
}, inplace = True)
corr_matrix = temp.loc[:, train_columns + label_columns].corr()
plt.figure(figsize=(10, 5))
sns.heatmap(corr_matrix.loc[train_columns, label_columns], annot=True, cmap='coolwarm')
plt.title('Airport Review Score Correlation with Cancellation Reason')
plt.show()
In [42]:
import pandas as pd
import plotly.express as px
hist_df = target_airport_df.groupby(["CARRIERNAME_x", "CANCELLATION_CODE"]).size().reset_index().rename(columns = {0: "Count"})
hist_df.replace({"A": "Carrier", "B":"Weather", "C":"NAS", "D":"Security"}, inplace = True)
count_df = target_airport_df.groupby(["CARRIERNAME_x"]).size().reset_index().rename(columns = {0: "All_Count"})
hist_df = pd.merge(hist_df, count_df, on = "CARRIERNAME_x")
hist_df['Percent'] = hist_df.apply(lambda row: row['Count']/row['All_Count']*100, axis = 1)
hist_df = hist_df[hist_df['CANCELLATION_CODE'] != "Not"]
hist_df.sort_values(["CARRIERNAME_x", "CANCELLATION_CODE"], inplace = True)
fig = px.histogram(hist_df, x='CARRIERNAME_x', y='Percent', color='CANCELLATION_CODE' ,title='Airline Cancellation Code')
fig.show()
In [37]:
corr_matrix = temp.loc[:, train_columns + label_columns].corr()
plt.figure(figsize=(10, 5))
sns.heatmap(corr_matrix.loc[train_columns, label_columns], annot=True, cmap='coolwarm')
plt.title('Airline Review Score Correlation with Cancellation Reason')
plt.savefig("Airline Review vs Cancellation")